{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "from t5.data import preprocessors as prep\n",
    "import functools\n",
    "import t5\n",
    "import gin\n",
    "import sentencepiece as spm\n",
    "from glob import glob\n",
    "import os\n",
    "\n",
    "gin.parse_config_file('pretrained_models_base_operative_config.gin')\n",
    "vocab = 'sp10m.cased.ms-en.model'\n",
    "sp = spm.SentencePieceProcessor()\n",
    "sp.Load(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = [\n",
    "    '/home/husein/pure-text/dumping-parliament.txt',\n",
    "    '/home/husein/pure-text/filtered-dumping-wiki.txt',\n",
    "    '/home/husein/pure-text/filtered-dumping-academia.txt',\n",
    "    '/home/husein/pure-text/dumping-news.txt'\n",
    "]\n",
    "files.extend(glob('/home/husein/pure-text/the-pile/*.txt'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    string = string.replace('\\n', ' ').replace('\\t', ' ')\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    return string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open(files[-2]) as fopen:\n",
    "#     data = fopen.read().split('\\n')\n",
    "# results, result = [], []\n",
    "# for i in data:\n",
    "#     if not len(i) and len(result):\n",
    "#         results.append(' '.join(result))\n",
    "#         result = []\n",
    "#     else:\n",
    "#         if len(i):\n",
    "#             result.append(i)\n",
    "# if len(result):\n",
    "#     results.append(' '.join(result))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/husein/pure-text/dumping-parliament.txt 69487\n",
      "/home/husein/pure-text/filtered-dumping-wiki.txt 363179\n",
      "/home/husein/pure-text/filtered-dumping-academia.txt 951674\n",
      "/home/husein/pure-text/dumping-news.txt 189786\n",
      "/home/husein/pure-text/the-pile/00.jsonl-16.translated.txt 56289\n",
      "/home/husein/pure-text/the-pile/00.jsonl-17.translated.txt 54941\n",
      "/home/husein/pure-text/the-pile/00.jsonl-18.translated.txt 54978\n",
      "/home/husein/pure-text/the-pile/00.jsonl-0.translated.txt 22498\n",
      "/home/husein/pure-text/the-pile/00.jsonl-31.translated.txt 55411\n",
      "/home/husein/pure-text/the-pile/00.jsonl-11.translated.txt 33000\n",
      "/home/husein/pure-text/the-pile/00.jsonl-8.translated.txt 33907\n",
      "/home/husein/pure-text/the-pile/00.jsonl-28.translated.txt 55562\n",
      "/home/husein/pure-text/the-pile/00.jsonl-26.translated.txt 55366\n",
      "/home/husein/pure-text/the-pile/00.jsonl-6.translated.txt 34305\n",
      "/home/husein/pure-text/the-pile/00.jsonl-25.translated.txt 55575\n",
      "/home/husein/pure-text/the-pile/00.jsonl-32.translated.txt 55309\n",
      "/home/husein/pure-text/the-pile/00.jsonl-13.translated.txt 33211\n",
      "/home/husein/pure-text/the-pile/00.jsonl-19.translated.txt 55080\n",
      "/home/husein/pure-text/the-pile/00.jsonl-5.translated.txt 33939\n",
      "/home/husein/pure-text/the-pile/00.jsonl-9.translated.txt 36030\n",
      "/home/husein/pure-text/the-pile/00.jsonl-4.translated.txt 22538\n",
      "/home/husein/pure-text/the-pile/00.jsonl-24.translated.txt 54895\n",
      "/home/husein/pure-text/the-pile/00.jsonl-10.translated.txt 32973\n",
      "/home/husein/pure-text/the-pile/00.jsonl-23.translated.txt 55181\n",
      "/home/husein/pure-text/the-pile/00.jsonl-29.translated.txt 55429\n",
      "/home/husein/pure-text/the-pile/00.jsonl-15.translated.txt 55166\n",
      "/home/husein/pure-text/the-pile/00.jsonl-7.translated.txt 34428\n",
      "/home/husein/pure-text/the-pile/00.jsonl-30.translated.txt 55096\n",
      "/home/husein/pure-text/the-pile/00.jsonl-1.translated.txt 22714\n",
      "/home/husein/pure-text/the-pile/00.jsonl-2.translated.txt 37595\n",
      "/home/husein/pure-text/the-pile/00.jsonl-14.translated.txt 55458\n",
      "/home/husein/pure-text/the-pile/00.jsonl-21.translated.txt 55547\n",
      "/home/husein/pure-text/the-pile/00.jsonl-3.translated.txt 22794\n",
      "/home/husein/pure-text/the-pile/00.jsonl-27.translated.txt 55160\n",
      "/home/husein/pure-text/the-pile/00.jsonl-20.translated.txt 55109\n",
      "/home/husein/pure-text/the-pile/00.jsonl-22.translated.txt 55171\n",
      "/home/husein/pure-text/the-pile/00.jsonl-12.translated.txt 32965\n"
     ]
    }
   ],
   "source": [
    "for file in files:\n",
    "    with open(file) as fopen:\n",
    "        data = fopen.read().split('\\n')\n",
    "    results, result = [], []\n",
    "    for i in data:\n",
    "        if not len(i) and len(result):\n",
    "            results.append(' '.join(result))\n",
    "            result = []\n",
    "        else:\n",
    "            if len(i) > 10:\n",
    "                result.append(i)\n",
    "    if len(result):\n",
    "        results.append(' '.join(result))\n",
    "        \n",
    "    print(file, len(results))\n",
    "    s = os.path.split(file)[1]\n",
    "    filename = f'{s}-pair.tsv'\n",
    "    \n",
    "    with tf.io.gfile.GFile(filename, 'w') as outfile:\n",
    "        for i in range(len(results)):\n",
    "            outfile.write('%s\\t\\n' % (cleaning(results[i])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.6/site-packages/t5/models/mesh_transformer.py:210: UserWarning: get_sentencepiece_model_path is deprecated. Please pass the mixture or task vocabulary directly to the Mesh TensorFlow Transformer instead.\n",
      "  \"get_sentencepiece_model_path is deprecated. Please pass the mixture or \"\n"
     ]
    }
   ],
   "source": [
    "def pair_dataset(split, shuffle_files = False):\n",
    "    del shuffle_files\n",
    "    ds = tf.data.TextLineDataset(\n",
    "        glob('*pair.tsv')\n",
    "    )\n",
    "\n",
    "    ds = ds.map(\n",
    "        functools.partial(\n",
    "            tf.io.decode_csv,\n",
    "            record_defaults = ['', ''],\n",
    "            field_delim = '\\t',\n",
    "            use_quote_delim = False,\n",
    "        ),\n",
    "        num_parallel_calls = tf.data.experimental.AUTOTUNE,\n",
    "    )\n",
    "    ds = ds.map(lambda *ex: dict(zip(['text'], ex)))\n",
    "    return ds\n",
    "\n",
    "\n",
    "t5.data.TaskRegistry.remove('pair_dataset')\n",
    "t5.data.TaskRegistry.add(\n",
    "    'pair_dataset',\n",
    "    dataset_fn = pair_dataset,\n",
    "    splits = ['train'],\n",
    "    text_preprocessor = [prep.next_sentence_prediction],\n",
    "    sentencepiece_model_path = vocab,\n",
    "    metric_fns = [t5.evaluation.metrics.accuracy],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Entity <function neighboring_pairs.<locals>.<lambda> at 0x7fa8c70f5400> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of <function neighboring_pairs.<locals>.<lambda> at 0x7fa8c70f5400>, which Python reported as:\n",
      "      lambda x: x[text_key], num_parallel_calls=tf.data.experimental.AUTOTUNE)\n",
      "\n",
      "If this is a lambda function, the error may be avoided by creating the lambda in a standalone statement.\n",
      "WARNING: Entity <function neighboring_pairs.<locals>.<lambda> at 0x7fa8c70f5400> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of <function neighboring_pairs.<locals>.<lambda> at 0x7fa8c70f5400>, which Python reported as:\n",
      "      lambda x: x[text_key], num_parallel_calls=tf.data.experimental.AUTOTUNE)\n",
      "\n",
      "If this is a lambda function, the error may be avoided by creating the lambda in a standalone statement.\n"
     ]
    }
   ],
   "source": [
    "nq_task = t5.data.TaskRegistry.get(\"pair_dataset\")\n",
    "ds = nq_task.get_dataset(split='qa.tsv', sequence_length={\"inputs\": 1024, \"targets\": 1024})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "r = tfds.as_numpy(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'inputs_plaintext': b'nsp: Ia diperkenalkan di Colombia pada 1998, namun tidak begitu mendapat sambutan ketika itu. Daging wagyu, iaitu daging lembu Kobe berasal dari Jepun dikenali sebagai daging terbaik di dunia berdasarkan teksturnya yang lembut dan rasanya enak.',\n",
       " 'inputs': array([ 2532,  7175,    50,   132,  1707,    18,  5658,    23,  9544,\n",
       "          354,    29,   700,   231,  2023,   110,    24,     3, 18288,\n",
       "         2522,   562,  2470,    14,    92,  2312,  3927, 20959,  1059,\n",
       "           37,   437,   307,    41,  2312,   807,    18,   170,   468,\n",
       "        20640,    26,    17,  6349,    16, 13318, 17382,     3,     1]),\n",
       " 'targets_plaintext': b'not_next',\n",
       " 'targets': array([ 699, 3835,  144, 2990,   81,    1])}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "next(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
